# Load the cancer registry CSV and build the analysis data frame.
#
# Steps: clean the column names, split `geography` into `county` and `state`,
# derive a few combined predictors, drop rows with an implausible median age,
# and move the response and the derived case rate to the front.
cancer_df <- read_csv("./data/Cancer_Registry.csv") %>%
  janitor::clean_names() %>%
  select(target_death_rate, everything()) %>%
  # Split on the comma AND any whitespace that follows it; splitting on ","
  # alone leaves a leading space on every state value (and factor level).
  separate(geography, c("county", "state"), sep = ",\\s*") %>%
  mutate(
    county = as.factor(county),
    state = as.factor(state),
    # Average annual case count scaled to a rate per 100,000 of pop_est2015
    # (despite the `pct_` prefix this is not a percentage).
    pct_case_count = avg_ann_count / pop_est2015 * 100000,
    # Sum of the two 18-24 high-school columns.
    pct_hs = pct_no_hs18_24 + pct_hs18_24,
    # Sum of the 18-24 and 25-and-over bachelor's-degree columns.
    # NOTE(review): these look like percentages of different age groups, so
    # the sum is an index rather than a true percentage -- confirm intent.
    pct_bach_deg = pct_bach_deg18_24 + pct_bach_deg25_over
  ) %>%
  # Keep only rows with median_age below 100; larger values are treated as
  # data errors in this analysis.
  filter(median_age < 100) %>%
  select(target_death_rate, pct_case_count, everything())
## Parsed with column specification:
## cols(
## .default = col_double(),
## avgDeathsPerYear = col_integer(),
## medIncome = col_integer(),
## popEst2015 = col_integer(),
## binnedInc = col_character(),
## Geography = col_character()
## )
## See spec(...) for full column specifications.
Percentage of annual diagnosed case count plot
# Scatter plot of pct_case_count (y) against target_death_rate (x), with
# points coloured by state, rendered through ggplotly().
plot_count_pct <- ggplot(
  data = cancer_df,
  mapping = aes(x = target_death_rate, y = pct_case_count, color = state)
) +
  geom_point()
# A smoothing layer, geom_smooth(se = F), is deliberately left disabled.
ggplotly(plot_count_pct)
Incidence rate plot
# Scatter plot of target_death_rate (y) against incidence_rate (x), with
# points coloured by state, rendered through ggplotly().
plot_incidence <- ggplot(
  data = cancer_df,
  mapping = aes(x = incidence_rate, y = target_death_rate, color = state)
) +
  geom_point()
# A smoothing layer, geom_smooth(se = F), is deliberately left disabled.
ggplotly(plot_incidence)
# Influential points in the dataset come from the states of Florida and Virginia.
Income plot
# Scatter plot of target_death_rate (y) against med_income (x), with points
# coloured by state, rendered through ggplotly().
plot_income <- ggplot(
  data = cancer_df,
  mapping = aes(x = med_income, y = target_death_rate, color = state)
) +
  geom_point()
# A smoothing layer, geom_smooth(se = F), is deliberately left disabled.
ggplotly(plot_income)
Age plots
# Scatter plot of target_death_rate against median_age with a smoothed trend
# line (no confidence band), rendered through ggplotly().
plot_age_1 <-
  cancer_df %>%
  ggplot(aes(x = median_age, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned, which would silently switch the confidence band back on.
  geom_smooth(se = FALSE)
ggplotly(plot_age_1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# This column contains erroneous data: some median ages are larger than 100.
# Histogram of median_age, restricted to rows where median_age is below 100.
# geom_histogram() is left on its default binning.
ggplot(
  data = filter(cancer_df, median_age < 100),
  mapping = aes(x = median_age)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plot of target_death_rate against median_age_male with a smoothed
# trend line (no confidence band), rendered through ggplotly().
plot_age_2 <-
  cancer_df %>%
  ggplot(aes(x = median_age_male, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned, which would silently switch the confidence band back on.
  geom_smooth(se = FALSE)
ggplotly(plot_age_2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Scatter plot of target_death_rate against median_age_female with a smoothed
# trend line (no confidence band), rendered through ggplotly().
plot_age_3 <-
  cancer_df %>%
  ggplot(aes(x = median_age_female, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned, which would silently switch the confidence band back on.
  geom_smooth(se = FALSE)
ggplotly(plot_age_3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Pairwise correlation matrix of every column except county, state and
# binned_inc, printed as a tibble.
cancer_df %>%
  select(-county, -state, -binned_inc) %>%
  cor() %>%
  # as_tibble() replaces the deprecated as.tibble(). The conversion drops the
  # matrix row names, so rows follow the same order as the columns.
  as_tibble()
## # A tibble: 35 x 35
## target_death_ra… pct_case_count avg_ann_count avg_deaths_per_…
## <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.0551 -0.143 -0.0904
## 2 -0.0551 1 0.161 -0.0589
## 3 -0.143 0.161 1 0.940
## 4 -0.0904 -0.0589 0.940 1
## 5 0.448 0.0230 0.0742 0.0631
## 6 -0.428 0.0278 0.269 0.223
## 7 -0.119 -0.0518 0.927 0.978
## 8 0.429 -0.123 -0.135 -0.0667
## 9 -0.0225 -0.00419 0.0819 0.0633
## 10 -0.00429 0.124 -0.122 -0.145
## # ... with 25 more rows, and 31 more variables: incidence_rate <dbl>,
## # med_income <dbl>, pop_est2015 <dbl>, poverty_percent <dbl>,
## # study_per_cap <dbl>, median_age <dbl>, median_age_male <dbl>,
## # median_age_female <dbl>, avg_household_size <dbl>,
## # percent_married <dbl>, pct_no_hs18_24 <dbl>, pct_hs18_24 <dbl>,
## # pct_some_col18_24 <dbl>, pct_bach_deg18_24 <dbl>, pct_hs25_over <dbl>,
## # pct_bach_deg25_over <dbl>, pct_employed16_over <dbl>,
## # pct_unemployed16_over <dbl>, pct_private_coverage <dbl>,
## # pct_private_coverage_alone <dbl>, pct_emp_priv_coverage <dbl>,
## # pct_public_coverage <dbl>, pct_public_coverage_alone <dbl>,
## # pct_white <dbl>, pct_black <dbl>, pct_asian <dbl>,
## # pct_other_race <dbl>, pct_married_households <dbl>, birth_rate <dbl>,
## # pct_hs <dbl>, pct_bach_deg <dbl>
# Fit a linear model of target_death_rate and print its summary. Predictors:
# incidence_rate, med_income and pct_bach_deg25_over with their interaction,
# pct_unemployed16_over and poverty_percent with their interaction, and
# pct_public_coverage_alone.
summary(
  lm(
    formula = target_death_rate ~ incidence_rate +
      med_income * pct_bach_deg25_over +
      pct_unemployed16_over * poverty_percent +
      pct_public_coverage_alone,
    data = cancer_df
  )
)
##
## Call:
## lm(formula = target_death_rate ~ incidence_rate + med_income *
## pct_bach_deg25_over + pct_unemployed16_over * poverty_percent +
## pct_public_coverage_alone, data = cancer_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.865 -11.770 0.047 11.648 137.687
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.035e+02 8.133e+00 12.730
## incidence_rate 2.161e-01 6.775e-03 31.899
## med_income -4.662e-04 1.205e-04 -3.870
## pct_bach_deg25_over -2.719e+00 2.479e-01 -10.967
## pct_unemployed16_over 1.217e+00 2.868e-01 4.244
## poverty_percent 7.786e-01 1.841e-01 4.229
## pct_public_coverage_alone 1.924e-01 1.145e-01 1.680
## med_income:pct_bach_deg25_over 2.148e-05 4.502e-06 4.771
## pct_unemployed16_over:poverty_percent -3.050e-02 1.229e-02 -2.481
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## incidence_rate < 2e-16 ***
## med_income 0.000111 ***
## pct_bach_deg25_over < 2e-16 ***
## pct_unemployed16_over 2.26e-05 ***
## poverty_percent 2.41e-05 ***
## pct_public_coverage_alone 0.093006 .
## med_income:pct_bach_deg25_over 1.92e-06 ***
## pct_unemployed16_over:poverty_percent 0.013149 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.13 on 3008 degrees of freedom
## Multiple R-squared: 0.4747, Adjusted R-squared: 0.4733
## F-statistic: 339.8 on 8 and 3008 DF, p-value: < 2.2e-16